{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Classifying documents with Naive Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "From: gregl@zimmer.CSUFresno.EDU (Greg Lewis)\n",
      "Subject: Re: WARNING.....(please read)...\n",
      "Keywords: BRICK, TRUCK, DANGER\n",
      "Nntp-Posting-Host: zimmer.csufresno.edu\n",
      "Organization: CSU Fresno\n",
      "Lines: 33\n",
      "\n",
      "In article <1qh336INNfl5@CS.UTK.EDU> larose@austin.cs.utk.edu (Brian LaRose) writes:\n",
      ">This just a warning to EVERYBODY on the net.  Watch out for\n",
      ">folks standing NEXT to the road or on overpasses.   They can\n",
      ">cause SERIOUS HARM to you and your car.  \n",
      ">\n",
      ">(just a cliff-notes version of my story follows)\n",
      ">\n",
      ">10pm last night, I was travelling on the interstate here in\n",
      ">knoxville,  I was taking an offramp exit to another interstate\n",
      ">and my wife suddenly screamed and something LARGE hit the side\n",
      ">of my truck.  We slowed down, but after looking back to see the\n",
      ">vandals standing there, we drove on to the police station.\n",
      ">\n",
      ">She did get a good look at the guy and saw him \"cock his arm\" with\n",
      ">something the size of a cinderblock, BUT I never saw him. We are \n",
      ">VERY lucky the truck sits up high on the road; if it would have hit\n",
      ">her window, it would have killed her. \n",
      ">\n",
      ">The police are looking for the guy, but in all likelyhood he is gone. \n",
      "Stuff deleted...\n",
      "\n",
      "I am sorry to report that in Southern California it was a sick sport\n",
      "for a while to drop concrete blocks from the overpasses onto the\n",
      "freeway.  Several persons were killed when said blocks came through\n",
      "their windshields.  Many overpass bridges are now fenced, and they\n",
      "have made it illegal to loiter on such bridges (as if that would stop\n",
      "such people).  Yet many bridges are NOT fenced.  I always look up at a\n",
      "bridge while I still have time to take evasive action even though this\n",
      "*sport* has not reached us here in Fresno.\n",
      "___________________________________________________________________\n",
      "Greg_Lewis@csufresno.edu\n",
      "Photojournalism sequence, Department of Journalism\n",
      "CSU Fresno, Fresno, CA 93740\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.datasets import fetch_20newsgroups\n",
    "categories = [\"rec.autos\", \"rec.motorcycles\"]\n",
    "newgroups = fetch_20newsgroups(categories=categories)\n",
    "#take a look\n",
    "print \"\\n\".join(newgroups.data[:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['rec.autos', 'rec.motorcycles']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newgroups.target_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "count_vec = CountVectorizer()\n",
    "bow = count_vec.fit_transform(newgroups.data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1192x19177 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 164296 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "bow = np.array(bow.todense())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([u'10pm', u'1qh336innfl5', u'33', u'93740',\n",
       "       u'___________________________________________________________________'], \n",
       "      dtype='<U79')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words = np.array(count_vec.get_feature_names())\n",
    "words[bow[0] > 0][:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'10pm' in newgroups.data[0].lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'1qh336innfl5' in newgroups.data[0].lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn import naive_bayes\n",
    "clf = naive_bayes.GaussianNB().fit(X_train, y_train)\n",
    "\n",
    "X = bow\n",
    "y = newgroups.target\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5,stratify=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.94798657718120805"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "accuracy_score(y_test,clf.predict(X_test) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import fetch_20newsgroups\n",
    "mn_categories = [\"rec.autos\", \"rec.motorcycles\", \"talk.politics.guns\"]\n",
    "mn_newgroups = fetch_20newsgroups(categories=mn_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.95397008055235899"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mn_bow = count_vec.fit_transform(mn_newgroups.data)\n",
    "mn_bow = np.array(mn_bow.todense())\n",
    "\n",
    "X = mn_bow\n",
    "y = mn_newgroups.target\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5,stratify=y)\n",
    "\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "clf = MultinomialNB().fit(X_train, y_train)\n",
    "\n",
    "from sklearn.metrics import accuracy_score\n",
    "accuracy_score(y_test,clf.predict(X_test) )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
